The social deprivation index (SDI) is an effort to generate a scoring system of socioeconomic factors using US census data from the American Community Survey. The final SDI is a composite measure of percent living in poverty, percent with less than 12 years of education, percent single-parent households, the percentage living in rented housing units, the percentage living in the overcrowded housing unit, percent of households without a car, and percentage nonemployed adults under 65 years of age. For more details please see the webpage link.
In this webpage we will explore the SDI variable by start location and determine which summary statistic to use to coalesce by neighborhood.
citibike_df = read_csv("./citibike_clean/citibike_clean.csv")
head(citibike_df) |>
knitr::kable()
| bikeid | user_type | gender | age | start_time | stop_time | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | start_station_id | start_station_name | start_zipcode | start_uhf34_neighborhood | end_station_id | end_station_name | end_zipcode | end_uhf34_neighborhood | start_sdi_score | start_percent_overweight | start_aq | end_sdi_score | end_percent_overweight | end_aq | end_borough | start_borough |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39213 | Subscriber | Male | 51 | 2019-09-01 00:00:01 | 2019-09-01 00:05:29 | 40.73056 | -73.97398 | 40.73222 | -73.98166 | 3733 | Avenue C & E 18 St | 10009 | Union Square, Lower Manhattan | 504 | 1 Ave & E 16 St | 10009 | Union Square, Lower Manhattan | 88 | 40.5 | 8.67 | 88 | 40.5 | 8.67 | Manhattan | Manhattan |
| 21257 | Customer | Unknown | 50 | 2019-09-01 00:00:04 | 2019-09-01 00:19:09 | 40.68292 | -73.99318 | 40.69308 | -73.97179 | 3329 | Degraw St & Smith St | 11217 | Downtown Heights Slope | 270 | Adelphi St & Myrtle Ave | 11238 | Bedford Stuyvesant Crown Heights | 74 | 50.8 | 7.44 | 70 | 62.9 | 6.61 | Brooklyn | Brooklyn |
| 15242 | Customer | Unknown | 50 | 2019-09-01 00:00:07 | 2019-09-01 00:21:40 | 40.78473 | -73.96962 | 40.76585 | -73.98691 | 3168 | Central Park West & W 85 St | 10024 | Upper West Side | 423 | W 54 St & 9 Ave | 10019 | Chelsea Village | 41 | 43.4 | 7.38 | 63 | 38.1 | 10.02 | Manhattan | Manhattan |
| 32094 | Subscriber | Male | 27 | 2019-09-01 00:00:12 | 2019-09-01 00:10:26 | 40.74620 | -73.98856 | 40.76030 | -73.99884 | 486 | Broadway & W 29 St | 10019 | Chelsea Village | 478 | 11 Ave & W 41 St | 10018 | Chelsea Village | 63 | 38.1 | 10.02 | 65 | 38.1 | 10.02 | Manhattan | Manhattan |
| 28271 | Customer | Unknown | 50 | 2019-09-01 00:00:16 | 2019-09-01 00:08:19 | 40.70201 | -73.92377 | 40.70624 | -73.93387 | 3775 | Suydam St & Knickerbocker Ave | 11237 | Williamsburg Bushwick | 3771 | McKibbin St & Bogart St | 11206 | Williamsburg Bushwick | 98 | 61.8 | 7.50 | 100 | 61.8 | 7.50 | Brooklyn | Brooklyn |
| 39424 | Customer | Unknown | 50 | 2019-09-01 00:00:17 | 2019-09-01 00:08:27 | 40.70201 | -73.92377 | 40.70624 | -73.93387 | 3775 | Suydam St & Knickerbocker Ave | 11237 | Williamsburg Bushwick | 3771 | McKibbin St & Bogart St | 11206 | Williamsburg Bushwick | 98 | 61.8 | 7.50 | 100 | 61.8 | 7.50 | Brooklyn | Brooklyn |
citibike_df |>
summarise(
mean_sdi = round(mean(start_sdi_score, na.rm = TRUE), 1),
sd_sdi = round(sd(start_sdi_score, na.rm = TRUE), 1),
median_sdi = round(median(start_sdi_score, na.rm = TRUE), 1),
q1_sdi = round(quantile(start_sdi_score, 0.25, na.rm = TRUE), 1),
q3_sdi = round(quantile(start_sdi_score, 0.75, na.rm = TRUE), 1),
iqr_sdi = round(IQR(start_sdi_score, na.rm = TRUE), 1),
max_sdi = round(max(start_sdi_score, na.rm = TRUE), 1),
min_sdi = round(min(start_sdi_score, na.rm = TRUE), 1)
)|>
pivot_longer(
cols = c("mean_sdi", "sd_sdi", "median_sdi", "q1_sdi", "q3_sdi", "iqr_sdi", "max_sdi", "min_sdi"),
names_to = "Statistic",
values_to = "Value"
)|>
knitr::kable()
| Statistic | Value |
|---|---|
| mean_sdi | 64.9 |
| sd_sdi | 21.6 |
| median_sdi | 63.0 |
| q1_sdi | 49.0 |
| q3_sdi | 88.0 |
| iqr_sdi | 39.0 |
| max_sdi | 100.0 |
| min_sdi | 26.0 |
Since SDI score is on the zipcode level, we next decide if mean or median is the better way to iterate across neighborhoods.
citibike_df |>
group_by(start_uhf34_neighborhood) |>
summarise(
mean_sdi = mean(start_sdi_score, na.rm = TRUE),
median_sdi = median(start_sdi_score, na.rm = TRUE),
max_sdi = round(max(start_sdi_score, na.rm = TRUE), 1),
min_sdi = round(min(start_sdi_score, na.rm = TRUE), 1)
)|>
knitr::kable()
| start_uhf34_neighborhood | mean_sdi | median_sdi | max_sdi | min_sdi |
|---|---|---|---|---|
| Bedford Stuyvesant Crown Heights | 83.89027 | 83 | 97 | 70 |
| Bensonhurst Bay Ridge | 66.00000 | 66 | 66 | 66 |
| Central Harlem Morningside Heights | 96.12366 | 97 | 98 | 95 |
| Chelsea Village | 58.10703 | 63 | 70 | 37 |
| Coney Island Sheepshead Bay | 78.00000 | 78 | 78 | 78 |
| Downtown Heights Slope | 71.09194 | 74 | 97 | 47 |
| East Flatbush Flatbush | 88.00000 | 88 | 88 | 88 |
| East Harlem | 100.00000 | 100 | 100 | 100 |
| East New York | 98.57200 | 99 | 99 | 97 |
| Greenpoint | 78.31831 | 88 | 88 | 61 |
| Kingsbridge Riverdale | 92.00000 | 92 | 92 | 92 |
| Long Island City Astoria | 78.60394 | 76 | 85 | 62 |
| Ridgewood Forest Hills | 75.00000 | 75 | 75 | 75 |
| Sunset Park | 95.00000 | 95 | 95 | 95 |
| Union Square, Lower Manhattan | 72.17706 | 88 | 97 | 30 |
| Upper East Side Gramercy | 40.15412 | 42 | 50 | 26 |
| Upper West Side | 50.47311 | 43 | 74 | 41 |
| Washington Heights Inwood | 97.00000 | 97 | 97 | 97 |
| West Queens | 83.50252 | 84 | 84 | 81 |
| Williamsburg Bushwick | 97.58794 | 98 | 100 | 96 |
Mean and median SDI scores are similar by start neighborhood. However median SDI scores tend to be lower and better capture the wide variation in score across neighborhood.
plot = citibike_df |>
group_by(start_zipcode) |>
summarize(median_SDI = median(start_sdi_score)) |>
ggplot(aes(x = reorder(start_zipcode, -median_SDI), y = median_SDI,)) +
geom_point() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
labs(x = "Start Zipcode", y = "Median SDI")
print(plot)

It is difficult to discern the variation in start zip codes, but we can see there is a wide range.
plot = citibike_df |>
group_by(start_uhf34_neighborhood) |>
summarize(median_SDI = median(start_sdi_score)) |>
ggplot(aes(x = reorder(start_uhf34_neighborhood, -median_SDI), y = median_SDI,)) +
geom_point() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
labs(x = "Start Neighborhood", y = "Median SDI")
print(plot)

The air quality index (AQ) originated from NYC Open Data, contributed by the Department of Hygiene and Mental Health and include air quality indexes measured across boroughs in NYC. An exploration of AQ is below.
#import data
air_quality_df = read_csv("./data/air_quality/Air_Quality_20231126.csv") |>
janitor::clean_names() |>
mutate(
start_date = mdy(start_date),
year = year(start_date)
) |>
filter(year == "2019")
joined_uhf_34_42 = read_csv("./data/geocoding/joined_uhf_34_42.csv")
# add zip codes
air_geo =
air_quality_df |>
mutate(
uhf34 = case_when(geo_type_name == "UHF34" ~ "1")
)
#break up by geocoding system, add zipcodes
# uhf34
air_zip_neighborhoods_34 =
air_geo |>
filter(uhf34 == "1") |>
select(-uhf34, - message)
air_zip_neighborhoods_34 =
left_join(air_zip_neighborhoods_34, y = (joined_uhf_34_42),
by = join_by("geo_join_id" == "uhf34"))
air_final =
air_zip_neighborhoods_34 |>
select(-uhf34_neighborhood) |>
filter(!duplicated(unique_id))
head(air_final) |>
knitr::kable()
| unique_id | indicator_id | name | measure | measure_info | geo_type_name | geo_join_id | geo_place_name | time_period | start_date | data_value | year | uhf42 | uhf42_neighborhood | zip |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 643475 | 375 | Nitrogen dioxide (NO2) | Mean | ppb | UHF34 | 207 | East Flatbush - Flatbush | Summer 2019 | 2019-06-01 | 12.75 | 2019 | 207 | East Flatbush Flatbush | 11203 |
| 667370 | 375 | Nitrogen dioxide (NO2) | Mean | ppb | UHF34 | 207 | East Flatbush - Flatbush | Winter 2019-20 | 2019-12-01 | 26.13 | 2019 | 207 | East Flatbush Flatbush | 11203 |
| 649819 | 365 | Fine particles (PM 2.5) | Mean | mcg/m3 | UHF34 | 207 | East Flatbush - Flatbush | Annual Average 2019 | 2019-01-01 | 6.31 | 2019 | 207 | East Flatbush Flatbush | 11203 |
| 649858 | 365 | Fine particles (PM 2.5) | Mean | mcg/m3 | UHF34 | 407 | Southwest Queens | Annual Average 2019 | 2019-01-01 | 6.19 | 2019 | 407 | Southwest Queens | 11414 |
| 669642 | 365 | Fine particles (PM 2.5) | Mean | mcg/m3 | UHF34 | 407 | Southwest Queens | Winter 2019-20 | 2019-12-01 | 7.40 | 2019 | 407 | Southwest Queens | 11414 |
| 649886 | 365 | Fine particles (PM 2.5) | Mean | mcg/m3 | UHF34 | 503504 | Southern SI | Summer 2019 | 2019-06-01 | 7.31 | 2019 | 503 | Willowbrook | 10314 |
Average Air Quality Index Across Time Periods in 2019
annual_aq = air_final |>
filter(time_period == "Annual Average 2019")
annual_aq|>
ggplot(aes(x = time_period, y = data_value)) +
geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
labs(
x = "Time Period",
y = "Air Quality Index") +
theme_minimal()

# Calculate statistics
midpoint <- median(annual_aq$data_value)
low_quantile <- quantile(annual_aq$data_value, 0.25)
top_quantile <- quantile(annual_aq$data_value, 0.75)
Summary of data particles: Fine particles (PM 2.5), Nitrogen dioxide (NO2), and Ozone (O3)
air_final |>
group_by(name) |>
summarise(
min_value = min(data_value, na.rm = TRUE),
q1 = quantile(data_value, 0.25, na.rm = TRUE),
median_value = median(data_value, na.rm = TRUE),
q3 = quantile(data_value, 0.75, na.rm = TRUE),
max_value = max(data_value, na.rm = TRUE)
)|>
knitr::kable()
| name | min_value | q1 | median_value | q3 | max_value |
|---|---|---|---|---|---|
| Fine particles (PM 2.5) | 5.59 | 6.8825 | 7.520 | 8.1700 | 11.26 |
| Nitrogen dioxide (NO2) | 7.33 | 14.2125 | 17.855 | 22.7925 | 32.94 |
| Ozone (O3) | 24.24 | 27.9175 | 29.795 | 31.4000 | 37.44 |
air_final|>
ggplot(aes(x = data_value, fill = name)) +
geom_histogram(binwidth = 1, position = "dodge", alpha = 0.7) +
facet_wrap(~ name, scales = "free") +
labs(
x = "Data Value",
y = "Frequency",
fill = "Pollutant") +
theme_minimal()

Comparison of fine particles by neighborhood
air_particles =
air_final |>
filter(name == "Fine particles (PM 2.5)")
air_particles |>
filter(time_period == "Annual Average 2019") |>
ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(
x = "Neighborhood",
y = "Mean Fine particles (mcg/m3)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of fine particles by Neighborhood
air_final |>
filter(name == "Fine particles (PM 2.5)") |>
group_by(geo_place_name) |>
summarise(
min_value = min(data_value, na.rm = TRUE),
q1 = quantile(data_value, 0.25, na.rm = TRUE),
median_value = median(data_value, na.rm = TRUE),
q3 = quantile(data_value, 0.75, na.rm = TRUE),
max_value = max(data_value, na.rm = TRUE)
)|>
knitr::kable()
| geo_place_name | min_value | q1 | median_value | q3 | max_value |
|---|---|---|---|---|---|
| Bayside Little Neck-Fresh Meadows | 6.19 | 6.750 | 7.31 | 7.475 | 7.64 |
| Bedford Stuyvesant - Crown Heights | 6.61 | 7.210 | 7.81 | 7.960 | 8.11 |
| Bensonhurst - Bay Ridge | 6.21 | 6.885 | 7.56 | 7.675 | 7.79 |
| Borough Park | 6.32 | 6.905 | 7.49 | 7.700 | 7.91 |
| Canarsie - Flatlands | 6.15 | 6.500 | 6.85 | 7.280 | 7.71 |
| Central Harlem - Morningside Heights | 7.00 | 7.280 | 7.56 | 7.875 | 8.19 |
| Chelsea-Village | 10.02 | 10.555 | 11.09 | 11.175 | 11.26 |
| Coney Island - Sheepshead Bay | 6.04 | 6.510 | 6.98 | 7.310 | 7.64 |
| Downtown - Heights - Slope | 7.44 | 7.995 | 8.55 | 8.725 | 8.90 |
| East Flatbush - Flatbush | 6.31 | 6.890 | 7.47 | 7.670 | 7.87 |
| East Harlem | 7.11 | 7.470 | 7.83 | 8.050 | 8.27 |
| East New York | 6.56 | 6.980 | 7.40 | 7.710 | 8.02 |
| Flushing - Clearview | 6.61 | 7.185 | 7.76 | 7.790 | 7.82 |
| Fordham - Bronx Pk | 6.68 | 6.690 | 6.70 | 7.235 | 7.77 |
| Greenpoint | 8.56 | 8.575 | 8.59 | 9.260 | 9.93 |
| Jamaica | 6.24 | 6.845 | 7.45 | 7.465 | 7.48 |
| Kingsbridge - Riverdale | 6.37 | 6.540 | 6.71 | 7.245 | 7.78 |
| Long Island City - Astoria | 7.99 | 8.150 | 8.31 | 8.795 | 9.28 |
| Northeast Bronx | 6.66 | 6.670 | 6.68 | 7.190 | 7.70 |
| Northern SI | 6.05 | 6.680 | 7.31 | 7.435 | 7.56 |
| Pelham - Throgs Neck | 6.74 | 7.010 | 7.28 | 7.545 | 7.81 |
| Ridgewood - Forest Hills | 6.56 | 7.130 | 7.70 | 7.805 | 7.91 |
| Rockaways | 5.59 | 5.855 | 6.12 | 6.570 | 7.02 |
| South Bronx | 7.28 | 7.305 | 7.33 | 7.890 | 8.45 |
| Southeast Queens | 6.01 | 6.585 | 7.16 | 7.170 | 7.18 |
| Southern SI | 5.87 | 6.125 | 6.38 | 6.845 | 7.31 |
| Southwest Queens | 6.19 | 6.795 | 7.40 | 7.470 | 7.54 |
| Sunset Park | 7.32 | 7.745 | 8.17 | 8.515 | 8.86 |
| Union Square-Lower Manhattan | 8.67 | 9.290 | 9.91 | 10.045 | 10.18 |
| Upper East Side-Gramercy | 8.98 | 9.345 | 9.71 | 9.890 | 10.07 |
| Upper West Side | 7.38 | 7.625 | 7.87 | 8.210 | 8.55 |
| Washington Heights | 7.11 | 7.230 | 7.35 | 7.760 | 8.17 |
| West Queens | 7.35 | 7.795 | 8.24 | 8.425 | 8.61 |
| Williamsburg - Bushwick | 7.50 | 7.895 | 8.29 | 8.625 | 8.96 |
Comparison of NO2 by neighborhood
air_nitrogen =
air_final |>
filter(name == "Nitrogen dioxide (NO2)")
air_nitrogen |>
filter(time_period == "Annual Average 2019") |>
ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Neighborhood",
y = "Mean Nitrogen dioxide (ppb)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of NO2 by Neighborhood
air_final |>
filter(name == "Nitrogen dioxide (NO2)") |>
group_by(geo_place_name) |>
summarise(
min_value = min(data_value, na.rm = TRUE),
q1 = quantile(data_value, 0.25, na.rm = TRUE),
median_value = median(data_value, na.rm = TRUE),
q3 = quantile(data_value, 0.75, na.rm = TRUE),
max_value = max(data_value, na.rm = TRUE)
) |>
knitr::kable()
| geo_place_name | min_value | q1 | median_value | q3 | max_value |
|---|---|---|---|---|---|
| Bayside Little Neck-Fresh Meadows | 12.56 | 13.605 | 14.65 | 17.805 | 20.96 |
| Bedford Stuyvesant - Crown Heights | 14.34 | 16.015 | 17.69 | 22.635 | 27.58 |
| Bensonhurst - Bay Ridge | 13.00 | 14.350 | 15.70 | 19.725 | 23.75 |
| Borough Park | 13.58 | 15.120 | 16.66 | 21.150 | 25.64 |
| Canarsie - Flatlands | 9.24 | 11.190 | 13.14 | 18.000 | 22.86 |
| Central Harlem - Morningside Heights | 16.78 | 18.240 | 19.70 | 23.150 | 26.60 |
| Chelsea-Village | 24.12 | 24.660 | 25.20 | 29.070 | 32.94 |
| Coney Island - Sheepshead Bay | 9.97 | 11.735 | 13.50 | 17.800 | 22.10 |
| Downtown - Heights - Slope | 17.99 | 18.985 | 19.98 | 24.310 | 28.64 |
| East Flatbush - Flatbush | 12.75 | 14.510 | 16.27 | 21.200 | 26.13 |
| East Harlem | 16.73 | 18.065 | 19.40 | 23.080 | 26.76 |
| East New York | 11.97 | 13.890 | 15.81 | 20.655 | 25.50 |
| Flushing - Clearview | 13.61 | 14.935 | 16.26 | 19.590 | 22.92 |
| Fordham - Bronx Pk | 12.17 | 13.670 | 15.17 | 18.880 | 22.59 |
| Greenpoint | 19.22 | 20.310 | 21.40 | 25.335 | 29.27 |
| Jamaica | 13.06 | 14.285 | 15.51 | 18.900 | 22.29 |
| Kingsbridge - Riverdale | 10.39 | 11.945 | 13.50 | 17.385 | 21.27 |
| Long Island City - Astoria | 17.72 | 18.980 | 20.24 | 23.930 | 27.62 |
| Northeast Bronx | 12.28 | 13.475 | 14.67 | 18.290 | 21.91 |
| Northern SI | 12.41 | 13.290 | 14.17 | 17.585 | 21.00 |
| Pelham - Throgs Neck | 12.80 | 14.135 | 15.47 | 18.845 | 22.22 |
| Ridgewood - Forest Hills | 12.77 | 14.505 | 16.24 | 20.750 | 25.26 |
| Rockaways | 7.33 | 8.685 | 10.04 | 13.805 | 17.57 |
| South Bronx | 15.15 | 16.785 | 18.42 | 21.845 | 25.27 |
| Southeast Queens | 12.99 | 13.745 | 14.50 | 17.295 | 20.09 |
| Southern SI | 8.76 | 9.865 | 10.97 | 13.755 | 16.54 |
| Southwest Queens | 11.69 | 13.415 | 15.14 | 19.530 | 23.92 |
| Sunset Park | 16.21 | 17.335 | 18.46 | 22.410 | 26.36 |
| Union Square-Lower Manhattan | 21.16 | 21.775 | 22.39 | 26.550 | 30.71 |
| Upper East Side-Gramercy | 22.29 | 23.170 | 24.05 | 27.945 | 31.84 |
| Upper West Side | 18.84 | 19.820 | 20.80 | 24.455 | 28.11 |
| Washington Heights | 14.03 | 15.650 | 17.27 | 20.940 | 24.61 |
| West Queens | 15.58 | 17.125 | 18.67 | 22.695 | 26.72 |
| Williamsburg - Bushwick | 16.66 | 18.165 | 19.67 | 24.185 | 28.70 |
Comparison of O3 by neighborhood
air_final |>
filter(name == "Ozone (O3)") |>
ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(
x = "Neighborhood",
y = "Mean Ozone (ppb)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of O3 by Neighborhood
air_final |>
filter(name == "Ozone (O3)") |>
group_by(geo_place_name) |>
summarise(
min_value = min(data_value, na.rm = TRUE),
q1 = quantile(data_value, 0.25, na.rm = TRUE),
median_value = median(data_value, na.rm = TRUE),
q3 = quantile(data_value, 0.75, na.rm = TRUE),
max_value = max(data_value, na.rm = TRUE)
)|>
knitr::kable()
| geo_place_name | min_value | q1 | median_value | q3 | max_value |
|---|---|---|---|---|---|
| Bayside Little Neck-Fresh Meadows | 31.73 | 31.73 | 31.73 | 31.73 | 31.73 |
| Bedford Stuyvesant - Crown Heights | 30.46 | 30.46 | 30.46 | 30.46 | 30.46 |
| Bensonhurst - Bay Ridge | 30.83 | 30.83 | 30.83 | 30.83 | 30.83 |
| Borough Park | 30.62 | 30.62 | 30.62 | 30.62 | 30.62 |
| Canarsie - Flatlands | 34.05 | 34.05 | 34.05 | 34.05 | 34.05 |
| Central Harlem - Morningside Heights | 27.07 | 27.07 | 27.07 | 27.07 | 27.07 |
| Chelsea-Village | 24.24 | 24.24 | 24.24 | 24.24 | 24.24 |
| Coney Island - Sheepshead Bay | 33.33 | 33.33 | 33.33 | 33.33 | 33.33 |
| Downtown - Heights - Slope | 27.61 | 27.61 | 27.61 | 27.61 | 27.61 |
| East Flatbush - Flatbush | 31.43 | 31.43 | 31.43 | 31.43 | 31.43 |
| East Harlem | 27.51 | 27.51 | 27.51 | 27.51 | 27.51 |
| East New York | 32.61 | 32.61 | 32.61 | 32.61 | 32.61 |
| Flushing - Clearview | 31.31 | 31.31 | 31.31 | 31.31 | 31.31 |
| Fordham - Bronx Pk | 29.64 | 29.64 | 29.64 | 29.64 | 29.64 |
| Greenpoint | 27.77 | 27.77 | 27.77 | 27.77 | 27.77 |
| Jamaica | 32.83 | 32.83 | 32.83 | 32.83 | 32.83 |
| Kingsbridge - Riverdale | 28.43 | 28.43 | 28.43 | 28.43 | 28.43 |
| Long Island City - Astoria | 28.36 | 28.36 | 28.36 | 28.36 | 28.36 |
| Northeast Bronx | 30.34 | 30.34 | 30.34 | 30.34 | 30.34 |
| Northern SI | 28.46 | 28.46 | 28.46 | 28.46 | 28.46 |
| Pelham - Throgs Neck | 30.87 | 30.87 | 30.87 | 30.87 | 30.87 |
| Ridgewood - Forest Hills | 31.30 | 31.30 | 31.30 | 31.30 | 31.30 |
| Rockaways | 37.44 | 37.44 | 37.44 | 37.44 | 37.44 |
| South Bronx | 29.16 | 29.16 | 29.16 | 29.16 | 29.16 |
| Southeast Queens | 33.41 | 33.41 | 33.41 | 33.41 | 33.41 |
| Southern SI | 29.64 | 29.64 | 29.64 | 29.64 | 29.64 |
| Southwest Queens | 33.31 | 33.31 | 33.31 | 33.31 | 33.31 |
| Sunset Park | 28.83 | 28.83 | 28.83 | 28.83 | 28.83 |
| Union Square-Lower Manhattan | 25.41 | 25.41 | 25.41 | 25.41 | 25.41 |
| Upper East Side-Gramercy | 24.98 | 24.98 | 24.98 | 24.98 | 24.98 |
| Upper West Side | 25.59 | 25.59 | 25.59 | 25.59 | 25.59 |
| Washington Heights | 27.73 | 27.73 | 27.73 | 27.73 | 27.73 |
| West Queens | 29.95 | 29.95 | 29.95 | 29.95 | 29.95 |
| Williamsburg - Bushwick | 29.34 | 29.34 | 29.34 | 29.34 | 29.34 |
The overweight data contains publicly available data from NYC regarding the percent of people who are overweight per area. An exploration of this dataset is below.
overweight_data = citibike_df |>
select(start_station_latitude,
start_station_longitude,
start_uhf34_neighborhood,
start_zipcode,
end_station_latitude,
end_station_longitude,
end_uhf34_neighborhood,
start_borough, end_borough,
end_zipcode,
start_percent_overweight,
end_percent_overweight) |>
mutate(neighborhood = coalesce(end_uhf34_neighborhood, start_uhf34_neighborhood),
lat = coalesce(end_station_latitude, start_station_latitude),
long = coalesce(end_station_longitude, start_station_longitude),
borough = coalesce(start_borough, end_borough),
zipcode = coalesce(start_zipcode, end_zipcode),
overweight = coalesce(start_percent_overweight, end_percent_overweight)) |>
unique()
head(overweight_data) |>
knitr::kable()
| start_station_latitude | start_station_longitude | start_uhf34_neighborhood | start_zipcode | end_station_latitude | end_station_longitude | end_uhf34_neighborhood | start_borough | end_borough | end_zipcode | start_percent_overweight | end_percent_overweight | neighborhood | lat | long | borough | zipcode | overweight |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 40.73056 | -73.97398 | Union Square, Lower Manhattan | 10009 | 40.73222 | -73.98166 | Union Square, Lower Manhattan | Manhattan | Manhattan | 10009 | 40.5 | 40.5 | Union Square, Lower Manhattan | 40.73222 | -73.98166 | Manhattan | 10009 | 40.5 |
| 40.68292 | -73.99318 | Downtown Heights Slope | 11217 | 40.69308 | -73.97179 | Bedford Stuyvesant Crown Heights | Brooklyn | Brooklyn | 11238 | 50.8 | 62.9 | Bedford Stuyvesant Crown Heights | 40.69308 | -73.97179 | Brooklyn | 11217 | 50.8 |
| 40.78473 | -73.96962 | Upper West Side | 10024 | 40.76585 | -73.98691 | Chelsea Village | Manhattan | Manhattan | 10019 | 43.4 | 38.1 | Chelsea Village | 40.76585 | -73.98691 | Manhattan | 10024 | 43.4 |
| 40.74620 | -73.98856 | Chelsea Village | 10019 | 40.76030 | -73.99884 | Chelsea Village | Manhattan | Manhattan | 10018 | 38.1 | 38.1 | Chelsea Village | 40.76030 | -73.99884 | Manhattan | 10019 | 38.1 |
| 40.70201 | -73.92377 | Williamsburg Bushwick | 11237 | 40.70624 | -73.93387 | Williamsburg Bushwick | Brooklyn | Brooklyn | 11206 | 61.8 | 61.8 | Williamsburg Bushwick | 40.70624 | -73.93387 | Brooklyn | 11237 | 61.8 |
| 40.79127 | -73.96484 | Upper West Side | 10025 | 40.75020 | -73.99093 | Chelsea Village | Manhattan | Manhattan | 10001 | 43.4 | 38.1 | Chelsea Village | 40.75020 | -73.99093 | Manhattan | 10025 | 43.4 |
overweight_data |>
summarize(
mean = mean(overweight, na.rm = TRUE),
min = min(overweight, na.rm = TRUE),
max = max(overweight, na.rm = TRUE),
median = median(overweight, na.rm = TRUE),
std = sd(overweight, na.rm = TRUE)
) |>
knitr::kable()
| mean | min | max | median | std |
|---|---|---|---|---|
| 46.86309 | 36.5 | 71.2 | 41.1 | 10.86188 |
percent_obese =
overweight_data |>
ggplot(aes(x = reorder(zipcode, -overweight), y = overweight,)) +
geom_point() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
labs(title = "Percent of adults classified as overweight or obese, by area", x = "Location", y = "Percent")
print(percent_obese)

percent_obese =
overweight_data |>
group_by(neighborhood) |>
summarize(overweight = mean(overweight)) |>
ggplot(aes(x = reorder(neighborhood, -overweight), y = overweight,)) +
geom_point() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
labs(title = "Percent of adults classified as overweight or obese, by area", x = "Location", y = "Percent")
print(percent_obese)

# Create a leaflet map
# Select the necessary columns
map_data <- overweight_data %>%
select(lat, long, overweight)
map <- leaflet(data = map_data) %>%
addTiles() # Add map tiles (you can use different tile providers)
# Add color-coded circles based on overweight percentage
map <- map %>%
addCircleMarkers(
radius = 5, # Adjust the circle size as needed
fillColor = ~colorFactor("Blues", map_data$overweight)(overweight),
color = "black",
fillOpacity = 0.7,
popup = ~paste("Overweight Percentage:", overweight, "%"),
label = ~paste("Overweight Percentage:", overweight, "%")
)
# Display the map
map